Introduction

This analysis examines diplomatic recognition patterns among countries from 1960 to 2020. We’ll analyze how many other countries recognize each country per year, track changes in recognition over time, and identify global recognition patterns.

Data Loading and Preparation

First, let’s load the diplomatic representation data:

# Load the diplomatic data
diplomacy_data <- read_excel("/Users/yutianyi/Desktop/MA thesis data creation/Diplometrics Diplomatic Representation 1960-2020_20211215.xlsx", 
                            sheet = "Data")

# Also load country codes reference for additional context
cow_continents <- read.csv("cow_continent_crosswalk_complete.csv")
states_info <- read.csv("states2016.csv")
nmc_data<-read.csv("/Users/yutianyi/Desktop/MA thesis data creation/NMC_Documentation-6.0/NMC-60-abridged/NMC-60-abridged.csv")
# Display the structure of the main dataset
str(diplomacy_data)
## tibble [413,582 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Destination          : chr [1:413582] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Destination Region   : chr [1:413582] "Asia" "Asia" "Asia" "Asia" ...
##  $ Destination Subregion: chr [1:413582] "Southern Asia" "Southern Asia" "Southern Asia" "Southern Asia" ...
##  $ Sending Country      : chr [1:413582] "Czechoslovakia" "Egypt" "France" "Germany" ...
##  $ Sending Region       : chr [1:413582] "Europe" "Africa" "Europe" "Europe" ...
##  $ Sending Subregion    : chr [1:413582] "Eastern Europe" "Northern Africa" "Western Europe" "Western Europe" ...
##  $ Year                 : num [1:413582] 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
##  $ Location             : chr [1:413582] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Embassy              : num [1:413582] 5 6 6 6 6 4 6 5 6 6 ...
##  $ Focus                : num [1:413582] 1 1 1 1 1 1 1 1 1 1 ...
##  $ EmbassyFocus         : num [1:413582] 51 61 61 61 61 41 61 51 61 61 ...
##  $ LOR                  : num [1:413582] 0.75 1 1 1 1 0.75 1 0.75 1 1 ...
# Show a sample of the data
head(diplomacy_data)
## # A tibble: 6 × 12
##   Destination `Destination Region` `Destination Subregion` `Sending Country`
##   <chr>       <chr>                <chr>                   <chr>            
## 1 Afghanistan Asia                 Southern Asia           Czechoslovakia   
## 2 Afghanistan Asia                 Southern Asia           Egypt            
## 3 Afghanistan Asia                 Southern Asia           France           
## 4 Afghanistan Asia                 Southern Asia           Germany          
## 5 Afghanistan Asia                 Southern Asia           India            
## 6 Afghanistan Asia                 Southern Asia           Indonesia        
## # ℹ 8 more variables: `Sending Region` <chr>, `Sending Subregion` <chr>,
## #   Year <dbl>, Location <chr>, Embassy <dbl>, Focus <dbl>, EmbassyFocus <dbl>,
## #   LOR <dbl>

Data Transformation

Let’s transform the data to count recognitions per country per year:

# Count how many countries recognize each destination country per year
# We consider a country "recognized" if it has any diplomatic representation (Embassy > 0)
country_recognition <- diplomacy_data %>%
  filter(!is.na(Embassy) & Embassy > 0) %>%
  group_by(Destination, Year) %>%
  summarize(
    recognition_count = n_distinct(`Sending Country`),
    avg_embassy_level = mean(Embassy, na.rm = TRUE),
    avg_lor = mean(LOR, na.rm = TRUE),
    .groups = "drop"
  )

# Get the number of potential sending countries per year
# This helps normalize the recognition counts
total_senders_by_year <- diplomacy_data %>%
  group_by(Year) %>%
  summarize(
    total_potential_senders = n_distinct(`Sending Country`),
    .groups = "drop"
  )

# Join the data to get recognition percentage
country_recognition <- country_recognition %>%
  left_join(total_senders_by_year, by = "Year") %>%
  mutate(recognition_percentage = (recognition_count / total_potential_senders) * 100)

# Display the transformed data
head(country_recognition)
## # A tibble: 6 × 7
##   Destination  Year recognition_count avg_embassy_level avg_lor
##   <chr>       <dbl>             <int>             <dbl>   <dbl>
## 1 Afghanistan  1960                16              5.31   0.906
## 2 Afghanistan  1961                18              5.22   0.889
## 3 Afghanistan  1962                18              5.94   0.931
## 4 Afghanistan  1963                13              1      0.75 
## 5 Afghanistan  1964                14              1      0.75 
## 6 Afghanistan  1965                14              1      0.75 
## # ℹ 2 more variables: total_potential_senders <int>,
## #   recognition_percentage <dbl>

Analyzing Recognition by Country

Let’s look at recognition patterns for specific countries:

# Select a few countries of interest for detailed analysis
countries_of_interest <- c("United States", "Russia", "China", "India", "Brazil", 
                          "South Africa", "Japan", "Germany", "France", "United Kingdom")

# Filter data for these countries
selected_countries <- country_recognition %>%
  filter(Destination %in% countries_of_interest)

# Plot recognition trends for selected countries
ggplot(selected_countries, aes(x = Year, y = recognition_percentage, color = Destination)) +
  geom_line(size = 1) +
  labs(
    title = "Diplomatic Recognition Trends for Selected Countries",
    subtitle = "Percentage of countries providing diplomatic recognition",
    x = "Year",
    y = "Recognition Percentage",
    color = "Country"
  ) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  theme(legend.position = "right")

Regional Recognition Analysis

Let’s examine recognition patterns by region using our continents data:

# Join with continents data to analyze by region
# First, create a simplified version of the cow_continents data
continent_mapping <- cow_continents %>%
  select(statenme, continent) %>%
  distinct()

# Rename to match our destination column
continent_mapping <- continent_mapping %>%
  rename(Destination = statenme)

# Join with our recognition data
recognition_by_continent <- country_recognition %>%
  left_join(continent_mapping, by = "Destination")

# Remove entries without continent information
recognition_by_continent <- recognition_by_continent %>%
  filter(!is.na(continent))

# Calculate average recognition by continent and year
continent_summary <- recognition_by_continent %>%
  group_by(continent, Year) %>%
  summarize(
    avg_recognition_percentage = mean(recognition_percentage, na.rm = TRUE),
    countries_in_region = n(),
    .groups = "drop"
  )

# Plot regional recognition trends
ggplot(continent_summary, aes(x = Year, y = avg_recognition_percentage, color = continent)) +
  geom_line(size = 1) +
  labs(
    title = "Average Diplomatic Recognition by Continent (1960-2020)",
    subtitle = "Average percentage of countries receiving diplomatic recognition",
    x = "Year",
    y = "Average Recognition Percentage",
    color = "Continent"
  ) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) paste0(x, "%"))

# Step 1: Use states2016.csv as a crosswalk to map country names to codes
# This gives us the ccode for each country in the recognition dataset
recognition_with_ccode <- country_recognition %>%
  left_join(states_info %>% select(statenme, ccode), 
            by = c("Destination" = "statenme"))

# Examine which countries couldn't be matched (NA values)
unmatched_countries <- recognition_with_ccode %>%
  filter(is.na(ccode)) %>%
  select(Destination) %>%
  distinct() %>%
  arrange(Destination)

# View the count and list of unmatched countries
print(paste("Number of unmatched countries:", nrow(unmatched_countries)))
## [1] "Number of unmatched countries: 23"
print("Unmatched countries:")
## [1] "Unmatched countries:"
print(unmatched_countries)
## # A tibble: 23 × 1
##    Destination                           
##    <chr>                                 
##  1 Antigua and Barbuda                   
##  2 Cabo Verde                            
##  3 Congo-Brazzaville                     
##  4 Congo-Kinshasa                        
##  5 Côte d’Ivoire                         
##  6 Gambia, The                           
##  7 Holy See (Vatican)                    
##  8 Korea, Democratic People's Republic of
##  9 Korea, Republic of                    
## 10 Lao People's Democratic Republic      
## # ℹ 13 more rows
library(countrycode)

# Apply countrycode to standardize country names
recognition_with_ccode <- country_recognition %>%
  # First convert Destination names to ISO country codes
  mutate(
    ccode = countrycode(
      sourcevar = Destination,
      origin = "country.name",  # Input is country names
      destination = "cown",     # Convert to Correlates of War codes
      warn = FALSE              # Suppress warnings for unmatched countries
    )
  )

# Check which countries could not be matched
unmatched_countries <- recognition_with_ccode %>%
  filter(is.na(ccode)) %>%
  select(Destination) %>%
  distinct() %>%
  arrange(Destination)

# Print the results
print(paste("Number of countries matched:", 
            n_distinct(recognition_with_ccode$Destination) - nrow(unmatched_countries)))
## [1] "Number of countries matched: 203"
print(paste("Number of unmatched countries:", nrow(unmatched_countries)))
## [1] "Number of unmatched countries: 2"
# After initial join with countrycode or states_info
recognition_with_ccode <- recognition_with_ccode %>%
  mutate(
    ccode = case_when(
      Destination == "Serbia" ~ 345,
      Destination == "Serbia and Montenegro" ~ 345,
      TRUE ~ ccode
    )
  )

# Check that the replacement worked
serbia_check <- recognition_with_ccode %>%
  filter(Destination %in% c("Serbia", "Serbia and Montenegro")) %>%
  select(Destination, ccode) %>%
  distinct()

print(serbia_check)
## # A tibble: 2 × 2
##   Destination           ccode
##   <chr>                 <dbl>
## 1 Serbia                  345
## 2 Serbia and Montenegro   345
# After initial join with countrycode or states_info
recognition_with_ccode <- recognition_with_ccode %>%
  mutate(
    ccode = case_when(
      Destination == "Serbia" ~ 345,
      Destination == "Serbia and Montenegro" ~ 345,
      TRUE ~ ccode
    )
  )

# Check that the replacement worked
serbia_check <- recognition_with_ccode %>%
  filter(Destination %in% c("Serbia", "Serbia and Montenegro")) %>%
  select(Destination, ccode) %>%
  distinct()

print(serbia_check)
## # A tibble: 2 × 2
##   Destination           ccode
##   <chr>                 <dbl>
## 1 Serbia                  345
## 2 Serbia and Montenegro   345
integrated_data <- recognition_with_ccode %>%
  # Filter out any remaining NA codes (if you want to only keep matched countries)
  filter(!is.na(ccode)) %>%
  # Join with NMC data on both country code AND year
  inner_join(
    nmc_data %>% select(ccode, year, milex, milper, irst, pec, tpop, upop, cinc),
    by = c("ccode" = "ccode", "Year" = "year")
  )

# Check the result
print(paste("Number of rows in integrated dataset:", nrow(integrated_data)))
## [1] "Number of rows in integrated dataset: 9124"
print(paste("Number of countries in integrated dataset:", n_distinct(integrated_data$Destination)))
## [1] "Number of countries in integrated dataset: 204"
print(paste("Year range in integrated dataset:", min(integrated_data$Year), "to", max(integrated_data$Year)))
## [1] "Year range in integrated dataset: 1960 to 2016"
# Display the first few rows
head(integrated_data)
## # A tibble: 6 × 15
##   Destination  Year recognition_count avg_embassy_level avg_lor
##   <chr>       <dbl>             <int>             <dbl>   <dbl>
## 1 Afghanistan  1960                16              5.31   0.906
## 2 Afghanistan  1961                18              5.22   0.889
## 3 Afghanistan  1962                18              5.94   0.931
## 4 Afghanistan  1963                13              1      0.75 
## 5 Afghanistan  1964                14              1      0.75 
## 6 Afghanistan  1965                14              1      0.75 
## # ℹ 10 more variables: total_potential_senders <int>,
## #   recognition_percentage <dbl>, ccode <dbl>, milex <int>, milper <int>,
## #   irst <int>, pec <int>, tpop <dbl>, upop <dbl>, cinc <dbl>
integrated_data_with_continent <- integrated_data %>%
  # Use ccode for joining since both datasets have this field
  left_join(
    cow_continents %>% select(ccode, continent),
    by = c("ccode" = "ccode")
  )

# Check how many entries couldn't be matched with a continent
print(paste("Rows without continent information:", 
            sum(is.na(integrated_data_with_continent$continent))))
## [1] "Rows without continent information: 9"
# If there are missing continents, examine which countries are missing
if(sum(is.na(integrated_data_with_continent$continent)) > 0) {
  missing_continents <- integrated_data_with_continent %>%
    filter(is.na(continent)) %>%
    select(Destination, ccode) %>%
    distinct()
  
  print("Countries missing continent information:")
  print(head(missing_continents, 10))
}
## [1] "Countries missing continent information:"
## # A tibble: 1 × 2
##   Destination ccode
##   <chr>       <dbl>
## 1 Kosovo        347
# Check the result
print(paste("Number of countries in final dataset:", 
            n_distinct(integrated_data_with_continent$Destination)))
## [1] "Number of countries in final dataset: 204"
print(paste("Number of continents represented:", 
            n_distinct(integrated_data_with_continent$continent, na.rm = TRUE)))
## [1] "Number of continents represented: 6"
# Assign "Europe" to Kosovo (ccode 347) where continent is NA
integrated_data_with_continent <- integrated_data_with_continent %>%
  mutate(
    continent = case_when(
      ccode == 347 & is.na(continent) ~ "Europe",
      TRUE ~ continent
    )
  )

# Check that Kosovo now has Europe assigned
kosovo_check <- integrated_data_with_continent %>%
  filter(Destination == "Kosovo") %>%
  select(Destination, ccode, continent) %>%
  distinct()

print(kosovo_check)
## # A tibble: 1 × 3
##   Destination ccode continent
##   <chr>       <dbl> <chr>    
## 1 Kosovo        347 Europe
# Display continent distribution
continent_distribution <- integrated_data_with_continent %>%
  filter(!is.na(continent)) %>%
  select(Destination, continent) %>%
  distinct() %>%
  group_by(continent) %>%
  summarize(country_count = n()) %>%
  arrange(desc(country_count))

print("Country distribution by continent:")
## [1] "Country distribution by continent:"
print(continent_distribution)
## # A tibble: 6 × 2
##   continent     country_count
##   <chr>                 <int>
## 1 Africa                   54
## 2 Asia                     51
## 3 Europe                   50
## 4 North America            23
## 5 Oceania                  14
## 6 South America            12
# The final dataset with all information
head(integrated_data_with_continent)
## # A tibble: 6 × 16
##   Destination  Year recognition_count avg_embassy_level avg_lor
##   <chr>       <dbl>             <int>             <dbl>   <dbl>
## 1 Afghanistan  1960                16              5.31   0.906
## 2 Afghanistan  1961                18              5.22   0.889
## 3 Afghanistan  1962                18              5.94   0.931
## 4 Afghanistan  1963                13              1      0.75 
## 5 Afghanistan  1964                14              1      0.75 
## 6 Afghanistan  1965                14              1      0.75 
## # ℹ 11 more variables: total_potential_senders <int>,
## #   recognition_percentage <dbl>, ccode <dbl>, milex <int>, milper <int>,
## #   irst <int>, pec <int>, tpop <dbl>, upop <dbl>, cinc <dbl>, continent <chr>
# Load the Polity5 data
polity5_data <- read_excel("p5v2018 2.xls", sheet = "p5v2018")

# Filter to only include years 1960-2016
polity5_filtered <- polity5_data %>%
  filter(year >= 1960 & year <= 2016)

# Check the result
print(paste("Original Polity5 data rows:", nrow(polity5_data)))
## [1] "Original Polity5 data rows: 17574"
print(paste("Filtered Polity5 data rows (1960-2016):", nrow(polity5_filtered)))
## [1] "Filtered Polity5 data rows (1960-2016): 8464"
print(paste("Number of countries in filtered data:", n_distinct(polity5_filtered$ccode)))
## [1] "Number of countries in filtered data: 180"
print(paste("Year range in filtered data:", min(polity5_filtered$year), "to", max(polity5_filtered$year)))
## [1] "Year range in filtered data: 1960 to 2016"
# View the key variables we might be interested in
polity5_summary <- polity5_filtered %>%
  select(ccode, country, year, democ, autoc, polity, polity2, durable, xconst) %>%
  head(10)

print("Sample of filtered Polity5 data:")
## [1] "Sample of filtered Polity5 data:"
print(polity5_summary)
## # A tibble: 10 × 9
##    ccode country      year democ autoc polity polity2 durable xconst
##    <dbl> <chr>       <dbl> <dbl> <dbl>  <dbl>   <dbl>   <dbl>  <dbl>
##  1   700 Afghanistan  1960     0    10    -10     -10      NA      1
##  2   700 Afghanistan  1961     0    10    -10     -10      NA      1
##  3   700 Afghanistan  1962     0    10    -10     -10      NA      1
##  4   700 Afghanistan  1963     0    10    -10     -10      NA      1
##  5   700 Afghanistan  1964     0     7     -7      -7       0      3
##  6   700 Afghanistan  1965     0     7     -7      -7       1      3
##  7   700 Afghanistan  1966     0     7     -7      -7       2      3
##  8   700 Afghanistan  1967     0     7     -7      -7       3      3
##  9   700 Afghanistan  1968     0     7     -7      -7       4      3
## 10   700 Afghanistan  1969     0     7     -7      -7       5      3
# Join Polity5 data with the integrated data
fully_integrated_data <- integrated_data_with_continent %>%
  left_join(
    polity5_filtered %>% 
      select(ccode, year, democ, autoc, polity, polity2, durable, xconst),
    by = c("ccode" = "ccode", "Year" = "year")
  )

# Check the join results
print(paste("Rows in fully integrated dataset:", nrow(fully_integrated_data)))
## [1] "Rows in fully integrated dataset: 9124"
print(paste("Rows with Polity data:", sum(!is.na(fully_integrated_data$polity))))
## [1] "Rows with Polity data: 8120"
print(paste("Percentage with Polity data:", 
            round(mean(!is.na(fully_integrated_data$polity)) * 100, 1), "%"))
## [1] "Percentage with Polity data: 89 %"
# The final fully integrated dataset now includes:
# - Recognition data 
# - Material capabilities data (NMC)
# - Continent information
# - Regime characteristics (Polity5)
head(fully_integrated_data)
## # A tibble: 6 × 22
##   Destination  Year recognition_count avg_embassy_level avg_lor
##   <chr>       <dbl>             <int>             <dbl>   <dbl>
## 1 Afghanistan  1960                16              5.31   0.906
## 2 Afghanistan  1961                18              5.22   0.889
## 3 Afghanistan  1962                18              5.94   0.931
## 4 Afghanistan  1963                13              1      0.75 
## 5 Afghanistan  1964                14              1      0.75 
## 6 Afghanistan  1965                14              1      0.75 
## # ℹ 17 more variables: total_potential_senders <int>,
## #   recognition_percentage <dbl>, ccode <dbl>, milex <int>, milper <int>,
## #   irst <int>, pec <int>, tpop <dbl>, upop <dbl>, cinc <dbl>, continent <chr>,
## #   democ <dbl>, autoc <dbl>, polity <dbl>, polity2 <dbl>, durable <dbl>,
## #   xconst <dbl>
# Find countries where all polity data is NA across all years
countries_missing_polity <- fully_integrated_data %>%
  # Group by country
  group_by(Destination, ccode) %>%
  # Check if ALL rows for this country have NA for all polity variables
  summarize(
    row_count = n(),
    all_polity_missing = all(is.na(democ) & is.na(autoc) & is.na(polity) & 
                             is.na(polity2) & is.na(durable) & is.na(xconst)),
    .groups = "drop"
  ) %>%
  # Keep only countries where all polity data is missing
  filter(all_polity_missing == TRUE) %>%
  # Sort by country name
  arrange(Destination)

# Print the results
print(paste("Number of countries with no polity data:", nrow(countries_missing_polity)))
## [1] "Number of countries with no polity data: 32"
print("Countries missing all polity data:")
## [1] "Countries missing all polity data:"
print(countries_missing_polity)
## # A tibble: 32 × 4
##    Destination         ccode row_count all_polity_missing
##    <chr>               <dbl>     <int> <lgl>             
##  1 Andorra               232        23 TRUE              
##  2 Antigua and Barbuda    58        32 TRUE              
##  3 Bahamas                31        41 TRUE              
##  4 Barbados               53        50 TRUE              
##  5 Belize                 80        34 TRUE              
##  6 Brunei                835        33 TRUE              
##  7 Dominica               54        30 TRUE              
##  8 Grenada                55        37 TRUE              
##  9 Iceland               395        57 TRUE              
## 10 Kiribati              946        18 TRUE              
## # ℹ 22 more rows
# Alternatively, you could check which states have ANY polity data
countries_with_polity <- fully_integrated_data %>%
  group_by(Destination, ccode) %>%
  summarize(
    has_any_polity_data = any(!is.na(polity) | !is.na(polity2)),
    .groups = "drop"
  ) %>%
  filter(has_any_polity_data == FALSE)

# Compare the counts
print(paste("Number of countries with no polity or polity2 data:", 
            nrow(countries_with_polity)))
## [1] "Number of countries with no polity or polity2 data: 32"
# First, let's check the year ranges for each Vietnam entity
vietnam_data <- fully_integrated_data %>%
  filter(ccode %in% c(816, 817, 818)) %>%
  group_by(Destination, ccode) %>%
  summarize(
    min_year = min(Year),
    max_year = max(Year),
    n_observations = n(),
    .groups = "drop"
  )

print("Time periods for Vietnam entities:")
## [1] "Time periods for Vietnam entities:"
print(vietnam_data)
## # A tibble: 3 × 5
##   Destination                      ccode min_year max_year n_observations
##   <chr>                            <dbl>    <dbl>    <dbl>          <int>
## 1 Viet Nam                           816     1977     2016             40
## 2 Viet Nam, Democratic Republic of   816     1960     1976              5
## 3 Viet Nam, Republic of              816     1960     1974             15
# Replace ccodes for Vietnam entities with their specific codes
fully_integrated_data <- fully_integrated_data %>%
  mutate(
    ccode = case_when(
      Destination == "Viet Nam" ~ 818,
      Destination == "Viet Nam, Democratic Republic of" ~ 816, 
      Destination == "Viet Nam, Republic of" ~ 817,
      TRUE ~ ccode
    )
  )
# Step 1: Identify states where all Polity 5 variables are NA
states_missing_polity <- fully_integrated_data %>%
  group_by(Destination, ccode) %>%
  summarize(
    row_count = n(),
    all_polity_missing = all(is.na(polity) & is.na(polity2) & is.na(democ) & 
                             is.na(autoc) & is.na(durable) & is.na(xconst)),
    .groups = "drop"
  ) %>%
  filter(all_polity_missing == TRUE) %>%
  arrange(Destination)

# Print summary
print(paste("Number of states with all Polity variables missing:", nrow(states_missing_polity)))
## [1] "Number of states with all Polity variables missing: 32"
# Step 2: Check if these states exist in polity5_filtered by ccode
polity5_ccodes <- unique(polity5_filtered$ccode)
polity5_countries <- unique(polity5_filtered$country)

# Add columns to indicate if the state exists in polity5_filtered
states_missing_polity <- states_missing_polity %>%
  mutate(
    exists_by_ccode = ccode %in% polity5_ccodes,
    # Create a standardized name column to help with matching
    dest_standardized = gsub("[^[:alnum:]]", "", tolower(Destination))
  )

# Create standardized country names for polity5 data
polity5_countries_std <- gsub("[^[:alnum:]]", "", tolower(polity5_countries))

# Check if they exist by name (approximate matching)
states_missing_polity <- states_missing_polity %>%
  mutate(
    exists_by_name = dest_standardized %in% polity5_countries_std
  ) %>%
  select(-dest_standardized) # Remove the temporary column
# Step 3: Categorize the results
states_missing_polity <- states_missing_polity %>%
  mutate(
    status = case_when(
      exists_by_ccode ~ "In Polity5 (matched by ccode)",
      exists_by_name ~ "In Polity5 (matched by name, but ccode differs)",
      TRUE ~ "Not in Polity5 dataset"
    )
  )

# Print the detailed results
print("States missing Polity data and their status:")
## [1] "States missing Polity data and their status:"
print(states_missing_polity)
## # A tibble: 32 × 7
##    Destination ccode row_count all_polity_missing exists_by_ccode exists_by_name
##    <chr>       <dbl>     <int> <lgl>              <lgl>           <lgl>         
##  1 Andorra       232        23 TRUE               FALSE           FALSE         
##  2 Antigua an…    58        32 TRUE               FALSE           FALSE         
##  3 Bahamas        31        41 TRUE               FALSE           FALSE         
##  4 Barbados       53        50 TRUE               FALSE           FALSE         
##  5 Belize         80        34 TRUE               FALSE           FALSE         
##  6 Brunei        835        33 TRUE               FALSE           FALSE         
##  7 Dominica       54        30 TRUE               FALSE           FALSE         
##  8 Grenada        55        37 TRUE               FALSE           FALSE         
##  9 Iceland       395        57 TRUE               FALSE           FALSE         
## 10 Kiribati      946        18 TRUE               FALSE           FALSE         
## # ℹ 22 more rows
## # ℹ 1 more variable: status <chr>
# Step 4: Summarize the findings
summary_table <- states_missing_polity %>%
  group_by(status) %>%
  summarize(
    count = n(),
    .groups = "drop"
  )

print("Summary of findings:")
## [1] "Summary of findings:"
print(summary_table)
## # A tibble: 2 × 2
##   status                        count
##   <chr>                         <int>
## 1 In Polity5 (matched by ccode)     4
## 2 Not in Polity5 dataset           28
# Step 5: For states that exist in Polity5 but have missing data in the integrated dataset,
# Check their actual ccodes in polity5_filtered to identify potential mismatches
mismatched_states <- states_missing_polity %>%
  filter(exists_by_ccode) %>%
  select(Destination, ccode)

if(nrow(mismatched_states) > 0) {
  print("States that exist in Polity5 but have missing data in the integrated dataset:")
  for(i in 1:nrow(mismatched_states)) {
    state_ccode <- mismatched_states$ccode[i]
    polity_years <- polity5_filtered %>%
      filter(ccode == state_ccode) %>%
      summarize(
        min_year = min(year),
        max_year = max(year),
        country_name = first(country)
      )
    
    print(paste0("- ", mismatched_states$Destination[i], 
                 " (ccode ", state_ccode, "): ",
                 "In Polity5 as '", polity_years$country_name, "' ",
                 "from ", polity_years$min_year, 
                 " to ", polity_years$max_year))
  }
}
## [1] "States that exist in Polity5 but have missing data in the integrated dataset:"
## [1] "- Kosovo (ccode 347): In Polity5 as 'Yugoslavia' from 1991 to 2006"
## [1] "- Serbia (ccode 345): In Polity5 as 'Yugoslavia' from 1960 to 1991"
## [1] "- Serbia and Montenegro (ccode 345): In Polity5 as 'Yugoslavia' from 1960 to 1991"
## [1] "- Viet Nam (ccode 818): In Polity5 as 'Vietnam' from 1976 to 2016"
# Update the ccodes for Kosovo, Serbia, and Serbia and Montenegro
fully_integrated_data <- fully_integrated_data %>%
  mutate(
    ccode = case_when(
      Destination == "Kosovo" ~ 341,
      Destination == "Serbia" ~ 342,
      Destination == "Serbia and Montenegro" ~ 347,
      TRUE ~ ccode
    )
  )

# Verify the changes were applied correctly
verification <- fully_integrated_data %>%
  filter(Destination %in% c("Kosovo", "Serbia", "Serbia and Montenegro")) %>%
  select(Destination, ccode) %>%
  distinct()

print("Updated country codes:")
## [1] "Updated country codes:"
print(verification)
## # A tibble: 3 × 2
##   Destination           ccode
##   <chr>                 <dbl>
## 1 Kosovo                  341
## 2 Serbia                  342
## 3 Serbia and Montenegro   347
# First, remove the existing Polity5 variables from fully_integrated_data
# This prevents duplicate columns when we join again
fully_integrated_data <- fully_integrated_data %>%
  select(-democ, -autoc, -polity, -polity2, -durable, -xconst)

# Now rejoin with polity5_filtered using the updated country codes
fully_integrated_data_updated <- fully_integrated_data %>%
  left_join(
    polity5_filtered %>% 
      select(ccode, year, democ, autoc, polity, polity2, durable, xconst),
    by = c("ccode" = "ccode", "Year" = "year")
  )

# Check how many rows have Polity data now
print(paste("Total rows in dataset:", nrow(fully_integrated_data_updated)))
## [1] "Total rows in dataset: 9124"
print(paste("Rows with Polity data:", sum(!is.na(fully_integrated_data_updated$polity))))
## [1] "Rows with Polity data: 8194"
print(paste("Percentage with Polity data:", 
            round(mean(!is.na(fully_integrated_data_updated$polity)) * 100, 1), "%"))
## [1] "Percentage with Polity data: 89.8 %"
# Check specifically for Kosovo, Serbia, and Serbia and Montenegro
balkan_states <- fully_integrated_data_updated %>%
  filter(Destination %in% c("Kosovo", "Serbia", "Serbia and Montenegro")) %>%
  group_by(Destination, ccode) %>%
  summarize(
    total_rows = n(),
    rows_with_polity = sum(!is.na(polity)),
    percentage_with_polity = round(mean(!is.na(polity)) * 100, 1),
    .groups = "drop"
  )

print("Polity data for Balkan states after update:")
## [1] "Polity data for Balkan states after update:"
print(balkan_states)
## # A tibble: 3 × 5
##   Destination           ccode total_rows rows_with_polity percentage_with_polity
##   <chr>                 <dbl>      <int>            <int>                  <dbl>
## 1 Kosovo                  341          9                9                    100
## 2 Serbia                  342         10               10                    100
## 3 Serbia and Montenegro   347         15               15                    100
# Replace the old dataset with the updated one
fully_integrated_data <- fully_integrated_data_updated
# First, identify which states have all Polity 5 variables missing
states_with_missing_polity <- fully_integrated_data %>%
  group_by(Destination, ccode) %>%
  summarize(
    all_polity_missing = all(is.na(polity) & is.na(polity2) & is.na(democ) & 
                             is.na(autoc) & is.na(durable) & is.na(xconst)),
    .groups = "drop"
  ) %>%
  filter(all_polity_missing == TRUE)

# Get the list of states to filter out
states_to_remove <- states_with_missing_polity %>%
  select(Destination) %>%
  pull()

# Print info about what we're filtering
print(paste("Filtering out", length(states_to_remove), "states with missing Polity data"))
## [1] "Filtering out 28 states with missing Polity data"
print("States being removed:")
## [1] "States being removed:"
print(states_to_remove)
##  [1] "Andorra"                          "Antigua and Barbuda"             
##  [3] "Bahamas"                          "Barbados"                        
##  [5] "Belize"                           "Brunei"                          
##  [7] "Dominica"                         "Grenada"                         
##  [9] "Iceland"                          "Kiribati"                        
## [11] "Liechtenstein"                    "Maldives"                        
## [13] "Malta"                            "Marshall Islands"                
## [15] "Micronesia, Federated States of"  "Monaco"                          
## [17] "Nauru"                            "Palau"                           
## [19] "Saint Kitts and Nevis"            "Saint Lucia"                     
## [21] "Saint Vincent and the Grenadines" "Samoa"                           
## [23] "San Marino"                       "Sao Tome and Principe"           
## [25] "Seychelles"                       "Tonga"                           
## [27] "Tuvalu"                           "Vanuatu"
# Create the filtered dataset
fully_integrated_data_filtered <- fully_integrated_data %>%
  filter(!(Destination %in% states_to_remove))

# Check the results
print(paste("Original dataset rows:", nrow(fully_integrated_data)))
## [1] "Original dataset rows: 9124"
print(paste("Filtered dataset rows:", nrow(fully_integrated_data_filtered)))
## [1] "Filtered dataset rows: 8276"
print(paste("Rows removed:", nrow(fully_integrated_data) - nrow(fully_integrated_data_filtered)))
## [1] "Rows removed: 848"
# Check that all remaining states have at least some Polity data
remaining_missing_check <- fully_integrated_data_filtered %>%
  group_by(Destination) %>%
  summarize(
    all_polity_still_missing = all(is.na(polity)),
    .groups = "drop"
  ) %>%
  filter(all_polity_still_missing == TRUE)

print(paste("States remaining with all polity still missing:", nrow(remaining_missing_check)))
## [1] "States remaining with all polity still missing: 0"
# Use this filtered dataset for further analysis
fully_integrated_data <- fully_integrated_data_filtered
# Get a complete list of variables
names(fully_integrated_data)
##  [1] "Destination"             "Year"                   
##  [3] "recognition_count"       "avg_embassy_level"      
##  [5] "avg_lor"                 "total_potential_senders"
##  [7] "recognition_percentage"  "ccode"                  
##  [9] "milex"                   "milper"                 
## [11] "irst"                    "pec"                    
## [13] "tpop"                    "upop"                   
## [15] "cinc"                    "continent"              
## [17] "democ"                   "autoc"                  
## [19] "polity"                  "polity2"                
## [21] "durable"                 "xconst"
# Check data structure and types
str(fully_integrated_data)
## tibble [8,276 × 22] (S3: tbl_df/tbl/data.frame)
##  $ Destination            : chr [1:8276] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                   : num [1:8276] 1960 1961 1962 1963 1964 ...
##  $ recognition_count      : int [1:8276] 16 18 18 13 14 14 14 14 22 22 ...
##  $ avg_embassy_level      : num [1:8276] 5.31 5.22 5.94 1 1 ...
##  $ avg_lor                : num [1:8276] 0.906 0.889 0.931 0.75 0.75 ...
##  $ total_potential_senders: int [1:8276] 94 114 116 118 126 127 128 133 134 137 ...
##  $ recognition_percentage : num [1:8276] 17 15.8 15.5 11 11.1 ...
##  $ ccode                  : num [1:8276] 700 700 700 700 700 700 700 700 700 700 ...
##  $ milex                  : int [1:8276] 14240 14184 13125 14795 14544 13865 16819 17932 20067 20915 ...
##  $ milper                 : int [1:8276] 60 110 104 110 112 97 97 95 89 89 ...
##  $ irst                   : int [1:8276] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pec                    : int [1:8276] 48 66 112 99 113 144 162 331 405 181 ...
##  $ tpop                   : num [1:8276] 10775 11014 11262 11521 11791 ...
##  $ upop                   : num [1:8276] 444 462 481 500 478 456 434 412 448 483 ...
##  $ cinc                   : num [1:8276] 0.00131 0.00176 0.00166 0.00166 0.00164 ...
##  $ continent              : chr [1:8276] "Asia" "Asia" "Asia" "Asia" ...
##  $ democ                  : num [1:8276] 0 0 0 0 0 0 0 0 0 0 ...
##  $ autoc                  : num [1:8276] 10 10 10 10 7 7 7 7 7 7 ...
##  $ polity                 : num [1:8276] -10 -10 -10 -10 -7 -7 -7 -7 -7 -7 ...
##  $ polity2                : num [1:8276] -10 -10 -10 -10 -7 -7 -7 -7 -7 -7 ...
##  $ durable                : num [1:8276] NA NA NA NA 0 1 2 3 4 5 ...
##  $ xconst                 : num [1:8276] 1 1 1 1 3 3 3 3 3 3 ...
# Write the dataset to a CSV file in the current working directory
write.csv(fully_integrated_data, file = "integrated_dataset.csv", row.names = FALSE)